# ----------------------------------------------------------------------------------
# Comparing deep solar, tracking the sun, and google project sunroof

# Things that we might want to compare
# - Total installs:
#   + GPS: existing_installs_count
#   + LBNL: count of rows
#   + DS: solar_system_count(_residential + _nonresidential)
# - Number of panels:
#   + GPS: number_of_panels_median
#   + LBNL: total_panels
#   + DS: total_panel_area/(Standard panel size)
# - Median system size:
#   + GPS: yearly_sunlight_kwh_median
#   + LBNL: system_size_dc
#   + DS: 
# - Daily solar irradiation:
#   + GPS: yearly_sunlight_kwh_total
#   + LBNL: 
#   + DS: daily_solar_radiation
# - Incentives:
#   + GPS: 
#   + LBNL: subsidy
#   + DS: rebate

# ----------------------------------------------------------------------------------
library(pacman)
p_load(
  here, data.table, tigris, lubridate, dplyr, readxl, stringr, ggplot2, tidyr
)

options(tigris_use_cache = TRUE)
theme_set(hrbrthemes::theme_ipsum())

# Loading google project sunroof and deep solar -------------------------------------
google_dt = fread(here("Data/google-project-sunroof/project-sunroof-census_tract.csv")) %>% 
  .[,region_name := as.character(region_name)]
deep_solar_dt = fread(here("Data/deep-solar/deepsolar_tract.csv")) %>% 
  .[,fips := as.character(fips)]

# Loading tracking the sun ----------------------------------------------------------
# Reading in the raw data
install_dt_raw = rbind(
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p1.csv")),
  fread(here("Data/LBNL-pv-installations/TTS_LBNL_public_file_14-Dec-2021_p2.csv"))
)

# Getting state fips codes and census divisions 
fips_dt = states(cb = TRUE) |> data.table() %>% 
  .[,.(state_fips = STATEFP, state = STUSPS, name = NAME)]

region_dt = fread(here("Data/census-regions.csv")) |>
  setnames(
    old = c("State","State Code","Region","Division"), 
    new = c("name","state","region","division")
  )

# Replacing all -1 with NA
install_dt_raw[install_dt_raw == -1] = NA

# Filtering to residential and 2000-2020 
install_dt = 
  install_dt_raw[
    customerSegment == "RES" & year(dmy(PTODate_orProxy_)) %in% 2000:2020
    ,
    .(
      customerSegment,
      zipcode = hostCustomerZip__4_,
      zip = str_sub(hostCustomerZip__4_, 1, 5),
      install_date = dmy(PTODate_orProxy_),
      city = hostCustomerCity,
      state,
      system_size_dc = systemSizeInDCSTC_KW_,
      total_cost = totalInstalledCost___,
      subsidy = Up_FrontCashIncentive___,
      cost_per_kw = totalInstalledCost___/systemSizeInDCSTC_KW_,
      total_panels = coalesce(moduleQty_1,0) + coalesce(moduleQty_2,0) + coalesce(moduleQty_3,0) 
    )
  ] |>
  merge(
    fips_dt, 
    by = "state",
    all.x = T
  ) |>
  merge(
    region_dt, 
    by = "state",
    all.x = T
  )

# Reading the crosswalk 
tract_zip_xwalk_dt = read_xlsx(
  here("Data/LBNL-pv-installations/ZIP_TRACT_122021.xlsx")
) |> data.table()

# First aggregating raw installs to zip code level
install_zip_dt = 
  install_dt[,
    .(
      total_installs = .N,
      total_kw = sum(system_size_dc, na.rm = T),
      median_kw = median(system_size_dc, na.rm = T),
      total_cost = sum(total_cost, na.rm = T),
      median_cost = median(total_cost, na.rm = T),
      total_subsidy = sum(subsidy, na.rm = T),
      median_subsidy = median(subsidy, na.rm = T),
      median_cost_per_kw = median(cost_per_kw, na.rm = T),
      total_panels = sum(total_panels, na.rm = T),
      median_tot_panels = median(total_panels, na.rm = T)
    ),
    by = .(zip, state)
  ] |>
  merge(
    tract_zip_xwalk_dt,
    by = "zip",
    all = T
  )

# Now summarizing by tract
install_tract_dt =
  install_zip_dt[
    !is.na(tract),
    lapply(.SD, \(x){sum(x * tot_ratio, na.rm = T)}),
    by = tract,
    .SDcols = str_subset(colnames(install_zip_dt),"median|total")
  ] 

# Combining everything together -----------------------------------------------------
solar_dt = 
  merge(
    deep_solar_dt,
    google_dt, 
    by.x = "fips" ,
    by.y = "region_name", 
    all = TRUE
  ) |>
  merge(
    install_tract_dt, 
    by.x = "fips",
    by.y = "tract",
    all = TRUE
  )

# Checking to see if they match well
solar_dt[,
  .N, 
  by = paste(
    if_else(is.na(state_name),"No GPS","GPS"), 
    if_else(is.na(total_installs), "No LBNL","LBNL"),
    if_else(is.na(county),"No DS","DS")
)]


# Comparing their measures ----------------------------------------------------------
# Total installs:
#   + GPS: existing_installs_count
#   + LBNL: count of rows
#   + DS: solar_system_count(_residential + _nonresidential)
solar_comp_dt = 
  solar_dt[,.(
    fips,
    # Total installs
    installs_gps = existing_installs_count, 
    installs_lbnl = total_installs, 
    installs_ds = solar_system_count,
    # Number of panels 
    panels_gps = number_of_panels_median,
    panels_lbnl = median_tot_panels, 
    panels_ds = total_panel_area/(65*39*0.0254^2), # Assuming panels 65x39in
    # Median system size
    kw_gps = yearly_sunlight_kwh_median,
    kw_lbnl = median_kw,
    # Daily solar ratiation
    radiation_gps = yearly_sunlight_kwh_total,
    radiation_ds = daily_solar_radiation*365.25
  )] |> 
    pivot_longer( 
      cols = starts_with(c("installs","panels","kw","radiation")), # Columns I want to pivot
      names_to = c(".value","source"), # Names for new columns
      names_pattern = '(.*)_(gps|lbnl|ds)' # Fill in ".value" with correct name
    ) |> data.table() 


# Total installs
merge(
  solar_comp_dt[source == "lbnl"],
  solar_comp_dt[source != "lbnl"],
  by = "fips"
) |>
ggplot(aes(x = installs.x, color = source.y)) + 
  geom_smooth(aes(y = installs.y)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  scale_color_viridis_d(
    name = "", begin = 0.25, end = 0.75,
    labels = c("DeepSolar","Google"),
    option = "inferno"
  ) +
  labs(
    title = "Total Installs",
    x = "Tracking the Sun Installs",
    y = "Satellite Installs"
  )

merge(
  solar_comp_dt[source == "lbnl"],
  solar_comp_dt[source == "gps"],
  by = "fips"
) |>
  ggplot(aes(x = installs.x)) + 
  geom_smooth(aes(y = installs.y)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    title = "Total Installs",
    x = "Deep Solar Installs",
    y = "Google Project Sunroof Installs"
  )


# - Number of panels:
merge(
  solar_comp_dt[source == "lbnl"],
  solar_comp_dt[source != "lbnl"],
  by = "fips"
) |>
  ggplot(aes(x = panels.x, color = source.y)) + 
  geom_smooth(aes(y = panels.y)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  scale_color_viridis_d(
    name = "", begin = 0.25, end = 0.75,
    labels = c("DeepSolar","Google"),
    option = "inferno"
  ) +
  xlim(0,2000)+
  labs(
    title = "Total Panels",
    x = "Tracking the Sun Panels",
    y = "Satellite Panels"
  )

# - Median system size:
merge(
  solar_comp_dt[source == "lbnl"],
  solar_comp_dt[source != "lbnl"],
  by = "fips"
) |>
  ggplot(aes(x = kw.x, color = source.y)) + 
  geom_smooth(aes(y = kw.y)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  scale_color_viridis_d(
    name = "", begin = 0.25, end = 0.75,
    labels = c("DeepSolar","Google"),
    option = "inferno"
  ) +
  labs(
    title = "Total Installs",
    x = "Tracking the Sun Installs",
    y = "Satellite Installs"
  )

# - Daily solar irradiation:
merge(
  solar_comp_dt[source == "ds"],
  solar_comp_dt[source == "gps"],
  by = "fips"
) |>
  ggplot(aes(x = radiation.x)) + 
  geom_smooth(aes(y = radiation.y)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(
    title = "Solar Radiation",
    x = "DeepSolar",
    y = "Google Project Sunroof"
  )
# - Incentives:
#   + GPS: 
#   + LBNL: subsidy
#   + DS: rebate






